site <- "http://www-bcf.usc.edu/~gareth/ISL/Advertising.csv"
AD <- read.csv(site)
head(AD)
## X TV Radio Newspaper Sales
## 1 1 230.1 37.8 69.2 22.1
## 2 2 44.5 39.3 45.1 10.4
## 3 3 17.2 45.9 69.3 9.3
## 4 4 151.5 41.3 58.5 18.5
## 5 5 180.8 10.8 58.4 12.9
## 6 6 8.7 48.9 75.0 7.2
dim(AD)
## [1] 200 5
library(DT)
datatable(AD[, -1], rownames = FALSE,
caption = 'Table 1: This is a simple caption for the table.')
plot(Sales ~ TV, data = AD, col = "red", pch = 19)
mod1 <- lm(Sales ~ TV, data = AD)
abline(mod1, col = "blue")
ggplot2library(ggplot2)
library(MASS)
p <- ggplot(data = AD, aes(x = TV, y = Sales)) +
geom_point(color = "lightblue") +
geom_smooth(method = "lm", se = FALSE, color = "blue") +
geom_smooth(method = "loess", color = "red", se = FALSE) +
geom_smooth(method = "rlm", color = "purple", se = FALSE) +
theme_bw()
p
ggvislibrary(ggvis)
AD %>%
ggvis(x = ~TV, y = ~Sales) %>%
layer_points() %>%
layer_model_predictions(model = "lm", se = FALSE) %>%
layer_model_predictions(model = "MASS::rlm", se = FALSE, stroke := "blue") %>%
layer_smooths(stroke:="red", se = FALSE)
plotlylibrary(plotly)
p2 <- ggplotly(p)
p2
library(car)
scatterplotMatrix(~ Sales + TV + Radio + Newspaper, data = AD)
Recall mod1
mod1 <- lm(Sales ~ TV, data = AD)
summary(mod1)
##
## Call:
## lm(formula = Sales ~ TV, data = AD)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.3860 -1.9545 -0.1913 2.0671 7.2124
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.032594 0.457843 15.36 <2e-16 ***
## TV 0.047537 0.002691 17.67 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.259 on 198 degrees of freedom
## Multiple R-squared: 0.6119, Adjusted R-squared: 0.6099
## F-statistic: 312.1 on 1 and 198 DF, p-value: < 2.2e-16
Residual: \(e_i = y_i - \hat{y_i}\)
To obtain the residuals for mod1 use the function resid on a linear model object.
eis <- resid(mod1)
RSS <- sum(eis^2)
RSS
## [1] 2102.531
RSE <- sqrt(RSS/(dim(AD)[1]-2))
RSE
## [1] 3.258656
# Or
summary(mod1)$sigma
## [1] 3.258656
The least squares estimators of \(\beta_0\) and \(\beta_1\) are
\[b_0 = \hat{\beta_0} = \bar{y} - b_1\bar{x}\] \[b_1 = \hat{\beta_1} = \frac{\sum_{i = 1}^n(x_i - \bar{x})(y_i - \bar{y})}{\sum_{i=1}^n(x_i-\bar{x})^2}\]
y <- AD$Sales
x <- AD$TV
b1 <- sum( (x - mean(x))*(y - mean(y)) ) / sum((x - mean(x))^2)
b0 <- mean(y) - b1*mean(x)
c(b0, b1)
## [1] 7.03259355 0.04753664
# Or using
coef(mod1)
## (Intercept) TV
## 7.03259355 0.04753664
summary(mod1)
##
## Call:
## lm(formula = Sales ~ TV, data = AD)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.3860 -1.9545 -0.1913 2.0671 7.2124
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.032594 0.457843 15.36 <2e-16 ***
## TV 0.047537 0.002691 17.67 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.259 on 198 degrees of freedom
## Multiple R-squared: 0.6119, Adjusted R-squared: 0.6099
## F-statistic: 312.1 on 1 and 198 DF, p-value: < 2.2e-16
XTXI <- summary(mod1)$cov.unscaled
MSE <- summary(mod1)$sigma^2
var.cov.b <- MSE*XTXI
var.cov.b
## (Intercept) TV
## (Intercept) 0.209620158 -1.064495e-03
## TV -0.001064495 7.239367e-06
seb0 <- sqrt(var.cov.b[1, 1])
seb1 <- sqrt(var.cov.b[2, 2])
c(seb0, seb1)
## [1] 0.457842940 0.002690607
coef(summary(mod1))
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.03259355 0.457842940 15.36028 1.40630e-35
## TV 0.04753664 0.002690607 17.66763 1.46739e-42
coef(summary(mod1))[1, 2]
## [1] 0.4578429
coef(summary(mod1))[2, 2]
## [1] 0.002690607
tb0 <- b0/seb0
tb1 <- b1/seb1
c(tb0, tb1)
## [1] 15.36028 17.66763
pvalues <- c(pt(tb0, 198, lower = FALSE)*2, pt(tb1, 198, lower = FALSE)*2)
pvalues
## [1] 1.40630e-35 1.46739e-42
coef(summary(mod1))
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.03259355 0.457842940 15.36028 1.40630e-35
## TV 0.04753664 0.002690607 17.66763 1.46739e-42
TSS <- sum((y - mean(y))^2)
c(RSS, TSS)
## [1] 2102.531 5417.149
R2 <- (TSS - RSS)/TSS
R2
## [1] 0.6118751
# Or
summary(mod1)$r.squared
## [1] 0.6118751
\[\text{CI}_{1 - \alpha}(\beta_1) = \left[b_1 - t_{1- \alpha/2, n - p + 1}SE(b1), b_1 + t_{1- \alpha/2, n - p + 1}SE(b1) \right]\]
Example: Construct a 90% confidence interval for \(\beta_1\).
alpha <- 0.10
ct <- qt(1 - alpha/2, 198)
ct
## [1] 1.652586
b1 +c(-1, 1)*ct*seb1
## [1] 0.04309018 0.05198310
# Or
confint(mod1, parm = "TV", level = 0.90)
## 5 % 95 %
## TV 0.04309018 0.0519831
confint(mod1)
## 2.5 % 97.5 %
## (Intercept) 6.12971927 7.93546783
## TV 0.04223072 0.05284256
Solution of linear systems Find the solution(s) if any to the following linear equations.
\[2x + y - z = 8\] \[-3x - y + 2z = -11\] \[-2x + y + 2z = -3\]
A <- matrix(c(2, -3, -2, 1, -1, 1, -1, 2, 2), nrow = 3)
b <- matrix(c(8, -11, -3), nrow = 3)
x <- solve(A)%*%b
x
## [,1]
## [1,] 2
## [2,] 3
## [3,] -1
# Or
solve(A, b)
## [,1]
## [1,] 2
## [2,] 3
## [3,] -1
See wikipedia for a review of matrix multiplication rules and properties.
Consider the 2 \(\times\) 2 matrix \(A\).
\[A = \begin{bmatrix} 2 & 4 \\ 9 & 5 \\ \end{bmatrix} \]
A <- matrix(c(2, 9, 4, 5), nrow = 2)
A
## [,1] [,2]
## [1,] 2 4
## [2,] 9 5
t(A) # Transpose of A
## [,1] [,2]
## [1,] 2 9
## [2,] 4 5
t(A)%*%A # A'A
## [,1] [,2]
## [1,] 85 53
## [2,] 53 41
solve(A)%*%A # I_2
## [,1] [,2]
## [1,] 1.000000e+00 0
## [2,] 1.110223e-16 1
zapsmall(solve(A)%*%A) # What you expect I_2
## [,1] [,2]
## [1,] 1 0
## [2,] 0 1
X <- model.matrix(mod1)
XTX <- t(X)%*%X
dim(XTX)
## [1] 2 2
XTXI <- solve(XTX)
XTXI
## (Intercept) TV
## (Intercept) 0.0197403984 -1.002458e-04
## TV -0.0001002458 6.817474e-07
# But it is best to compute this quantity using
summary(mod1)$cov.unscaled
## (Intercept) TV
## (Intercept) 0.0197403984 -1.002458e-04
## TV -0.0001002458 6.817474e-07
betahat <- XTXI%*%t(X)%*%y
betahat
## [,1]
## (Intercept) 7.03259355
## TV 0.04753664
coef(mod1)
## (Intercept) TV
## 7.03259355 0.04753664
XTXI <- summary(mod1)$cov.unscaled
MSE <- summary(mod1)$sigma^2
var.cov.b <- MSE*XTXI
var.cov.b
## (Intercept) TV
## (Intercept) 0.209620158 -1.064495e-03
## TV -0.001064495 7.239367e-06
mod2 <- lm(Sales ~ TV + Radio, data = AD)
summary(mod2)
##
## Call:
## lm(formula = Sales ~ TV + Radio, data = AD)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.7977 -0.8752 0.2422 1.1708 2.8328
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.92110 0.29449 9.919 <2e-16 ***
## TV 0.04575 0.00139 32.909 <2e-16 ***
## Radio 0.18799 0.00804 23.382 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.681 on 197 degrees of freedom
## Multiple R-squared: 0.8972, Adjusted R-squared: 0.8962
## F-statistic: 859.6 on 2 and 197 DF, p-value: < 2.2e-16
mod3 <- lm(Sales ~ TV + Radio + Newspaper, data = AD)
summary(mod3)
##
## Call:
## lm(formula = Sales ~ TV + Radio + Newspaper, data = AD)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.8277 -0.8908 0.2418 1.1893 2.8292
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.938889 0.311908 9.422 <2e-16 ***
## TV 0.045765 0.001395 32.809 <2e-16 ***
## Radio 0.188530 0.008611 21.893 <2e-16 ***
## Newspaper -0.001037 0.005871 -0.177 0.86
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.686 on 196 degrees of freedom
## Multiple R-squared: 0.8972, Adjusted R-squared: 0.8956
## F-statistic: 570.3 on 3 and 196 DF, p-value: < 2.2e-16
\[H_0: \beta_1 = \beta_2 = \beta_3 = 0\] versus the alternative \[H_1: \text{at least one } \beta_j \neq 0\]
The test statistic is \(F = \frac{(\text{TSS} - \text{RSS})/p}{\text{RSS}/(n-p-1)}\)
anova(mod3)
## Analysis of Variance Table
##
## Response: Sales
## Df Sum Sq Mean Sq F value Pr(>F)
## TV 1 3314.6 3314.6 1166.7308 <2e-16 ***
## Radio 1 1545.6 1545.6 544.0501 <2e-16 ***
## Newspaper 1 0.1 0.1 0.0312 0.8599
## Residuals 196 556.8 2.8
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
SSR <- sum(anova(mod3)[1:3, 2])
MSR <- SSR/3
SSE <- anova(mod3)[4, 2]
MSE <- SSE/(200-3-1)
Fobs <- MSR/MSE
Fobs
## [1] 570.2707
pvalue <- pf(Fobs, 3, 196, lower = FALSE)
pvalue
## [1] 1.575227e-96
# Or
summary(mod3)
##
## Call:
## lm(formula = Sales ~ TV + Radio + Newspaper, data = AD)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.8277 -0.8908 0.2418 1.1893 2.8292
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.938889 0.311908 9.422 <2e-16 ***
## TV 0.045765 0.001395 32.809 <2e-16 ***
## Radio 0.188530 0.008611 21.893 <2e-16 ***
## Newspaper -0.001037 0.005871 -0.177 0.86
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.686 on 196 degrees of freedom
## Multiple R-squared: 0.8972, Adjusted R-squared: 0.8956
## F-statistic: 570.3 on 3 and 196 DF, p-value: < 2.2e-16
summary(mod3)$fstatistic
## value numdf dendf
## 570.2707 3.0000 196.0000
Suppose we would like to test whether \(\beta_2 = \beta_3 = 0\). The reduced model with \(\beta_2 = \beta_3 = 0\) is mod1 while the full model is mod3.
summary(mod3)
##
## Call:
## lm(formula = Sales ~ TV + Radio + Newspaper, data = AD)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.8277 -0.8908 0.2418 1.1893 2.8292
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.938889 0.311908 9.422 <2e-16 ***
## TV 0.045765 0.001395 32.809 <2e-16 ***
## Radio 0.188530 0.008611 21.893 <2e-16 ***
## Newspaper -0.001037 0.005871 -0.177 0.86
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.686 on 196 degrees of freedom
## Multiple R-squared: 0.8972, Adjusted R-squared: 0.8956
## F-statistic: 570.3 on 3 and 196 DF, p-value: < 2.2e-16
anova(mod1, mod3)
## Analysis of Variance Table
##
## Model 1: Sales ~ TV
## Model 2: Sales ~ TV + Radio + Newspaper
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 198 2102.53
## 2 196 556.83 2 1545.7 272.04 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod.fs <- lm(Sales ~ 1, data = AD)
SCOPE <- (~. + TV + Radio + Newspaper)
add1(mod.fs, scope = SCOPE, test = "F")
## Single term additions
##
## Model:
## Sales ~ 1
## Df Sum of Sq RSS AIC F value Pr(>F)
## <none> 5417.1 661.80
## TV 1 3314.6 2102.5 474.52 312.145 < 2.2e-16 ***
## Radio 1 1798.7 3618.5 583.10 98.422 < 2.2e-16 ***
## Newspaper 1 282.3 5134.8 653.10 10.887 0.001148 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod.fs <- update(mod.fs, .~. + TV)
add1(mod.fs, scope = SCOPE, test = "F")
## Single term additions
##
## Model:
## Sales ~ TV
## Df Sum of Sq RSS AIC F value Pr(>F)
## <none> 2102.53 474.52
## Radio 1 1545.62 556.91 210.82 546.74 < 2.2e-16 ***
## Newspaper 1 183.97 1918.56 458.20 18.89 2.217e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod.fs <- update(mod.fs, .~. + Radio)
add1(mod.fs, scope = SCOPE, test = "F")
## Single term additions
##
## Model:
## Sales ~ TV + Radio
## Df Sum of Sq RSS AIC F value Pr(>F)
## <none> 556.91 210.82
## Newspaper 1 0.088717 556.83 212.79 0.0312 0.8599
summary(mod.fs)
##
## Call:
## lm(formula = Sales ~ TV + Radio, data = AD)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.7977 -0.8752 0.2422 1.1708 2.8328
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.92110 0.29449 9.919 <2e-16 ***
## TV 0.04575 0.00139 32.909 <2e-16 ***
## Radio 0.18799 0.00804 23.382 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.681 on 197 degrees of freedom
## Multiple R-squared: 0.8972, Adjusted R-squared: 0.8962
## F-statistic: 859.6 on 2 and 197 DF, p-value: < 2.2e-16
stepAICstepAIC(lm(Sales ~ 1, data = AD), scope = .~TV + Radio + Newspaper, direction = "forward", test = "F")
## Start: AIC=661.8
## Sales ~ 1
##
## Df Sum of Sq RSS AIC F Value Pr(F)
## + TV 1 3314.6 2102.5 474.52 312.145 < 2.2e-16 ***
## + Radio 1 1798.7 3618.5 583.10 98.422 < 2.2e-16 ***
## + Newspaper 1 282.3 5134.8 653.10 10.887 0.001148 **
## <none> 5417.1 661.80
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=474.52
## Sales ~ TV
##
## Df Sum of Sq RSS AIC F Value Pr(F)
## + Radio 1 1545.62 556.91 210.82 546.74 < 2.2e-16 ***
## + Newspaper 1 183.97 1918.56 458.20 18.89 2.217e-05 ***
## <none> 2102.53 474.52
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=210.82
## Sales ~ TV + Radio
##
## Df Sum of Sq RSS AIC F Value Pr(F)
## <none> 556.91 210.82
## + Newspaper 1 0.088717 556.83 212.79 0.031228 0.8599
##
## Call:
## lm(formula = Sales ~ TV + Radio, data = AD)
##
## Coefficients:
## (Intercept) TV Radio
## 2.92110 0.04575 0.18799
mod.be <- lm(Sales ~ TV + Radio + Newspaper, data = AD)
drop1(mod.be, test = "F")
## Single term deletions
##
## Model:
## Sales ~ TV + Radio + Newspaper
## Df Sum of Sq RSS AIC F value Pr(>F)
## <none> 556.8 212.79
## TV 1 3058.01 3614.8 584.90 1076.4058 <2e-16 ***
## Radio 1 1361.74 1918.6 458.20 479.3252 <2e-16 ***
## Newspaper 1 0.09 556.9 210.82 0.0312 0.8599
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod.be <- update(mod.be, .~. - Newspaper)
drop1(mod.be, test = "F")
## Single term deletions
##
## Model:
## Sales ~ TV + Radio
## Df Sum of Sq RSS AIC F value Pr(>F)
## <none> 556.9 210.82
## TV 1 3061.6 3618.5 583.10 1082.98 < 2.2e-16 ***
## Radio 1 1545.6 2102.5 474.52 546.74 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(mod.be)
##
## Call:
## lm(formula = Sales ~ TV + Radio, data = AD)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.7977 -0.8752 0.2422 1.1708 2.8328
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.92110 0.29449 9.919 <2e-16 ***
## TV 0.04575 0.00139 32.909 <2e-16 ***
## Radio 0.18799 0.00804 23.382 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.681 on 197 degrees of freedom
## Multiple R-squared: 0.8972, Adjusted R-squared: 0.8962
## F-statistic: 859.6 on 2 and 197 DF, p-value: < 2.2e-16
stepAICstepAIC(lm(Sales ~ TV + Radio + Newspaper, data = AD), scope = .~TV + Radio + Newspaper, direction = "backward", test = "F")
## Start: AIC=212.79
## Sales ~ TV + Radio + Newspaper
##
## Df Sum of Sq RSS AIC F Value Pr(F)
## - Newspaper 1 0.09 556.9 210.82 0.03 0.8599
## <none> 556.8 212.79
## - Radio 1 1361.74 1918.6 458.20 479.33 <2e-16 ***
## - TV 1 3058.01 3614.8 584.90 1076.41 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: AIC=210.82
## Sales ~ TV + Radio
##
## Df Sum of Sq RSS AIC F Value Pr(F)
## <none> 556.9 210.82
## - Radio 1 1545.6 2102.5 474.52 546.74 < 2.2e-16 ***
## - TV 1 3061.6 3618.5 583.10 1082.98 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Call:
## lm(formula = Sales ~ TV + Radio, data = AD)
##
## Coefficients:
## (Intercept) TV Radio
## 2.92110 0.04575 0.18799
residualPlots(mod2)
## Test stat Pr(>|t|)
## TV -6.775 0.000
## Radio 1.054 0.293
## Tukey test 7.635 0.000
qqPlot(mod2)
influenceIndexPlot(mod2)
We use a confidence interval to quantify the uncertainty surrounding the average Sales over a large number of cities. For example, given that $100,000 is spent on TV advertising and $20,000 is spent on Radio advertising in each city, the 95% confidence interval is [10.9852544, 11.5276775]. We interpret this to mean that 95% of intervals of this form will contain the true value of Sales.
predict(mod.be, newdata = data.frame(TV = 100, Radio = 20), interval = "conf")
## fit lwr upr
## 1 11.25647 10.98525 11.52768
On the other hand, a prediction interval can be used to quantify the uncertainty surrounding Sales for a particular city. Given that $100,000 is spent on TV advertising and $20,000 is spent on Radio advertising in a particular city, the 95% prediction interval is [7.9296161, 14.5833158]. We interpret this to mean that 95% of intervals of this form will contain the true value of Sales for this city.
predict(mod.be, newdata = data.frame(TV = 100, Radio = 20), interval = "pred")
## fit lwr upr
## 1 11.25647 7.929616 14.58332
Note that both the intervals are centered at 11.256466, but that the prediction interval is substantially wider than the confidence interval, reflecting the increased uncertainty about Sales for a given city in comparison to the average Sales over many locations.
Credit <- read.csv("http://www-bcf.usc.edu/~gareth/ISL/Credit.csv")
datatable(Credit[, -1], rownames = FALSE)
str(Credit)
## 'data.frame': 400 obs. of 12 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Income : num 14.9 106 104.6 148.9 55.9 ...
## $ Limit : int 3606 6645 7075 9504 4897 8047 3388 7114 3300 6819 ...
## $ Rating : int 283 483 514 681 357 569 259 512 266 491 ...
## $ Cards : int 2 3 4 3 2 4 2 2 5 3 ...
## $ Age : int 34 82 71 36 68 77 37 87 66 41 ...
## $ Education: int 11 15 11 11 16 10 12 9 13 19 ...
## $ Gender : Factor w/ 2 levels "Female"," Male": 2 1 2 1 2 2 1 2 1 1 ...
## $ Student : Factor w/ 2 levels "No","Yes": 1 2 1 1 1 1 1 1 1 2 ...
## $ Married : Factor w/ 2 levels "No","Yes": 2 2 1 1 2 1 1 1 1 2 ...
## $ Ethnicity: Factor w/ 3 levels "African American",..: 3 2 2 2 3 3 1 2 3 1 ...
## $ Balance : int 333 903 580 964 331 1151 203 872 279 1350 ...